from re import A
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as smf
from scipy.stats import norm
import scipy.stats as stats
import statsmodels.api as sm

def getTruth(dataset,bvarb='isBlack',outcome='audited'):
    """Return the observed difference in mean outcome between the two groups.

    Parameters
    ----------
    dataset : Pandas Dataframe
        dataset holding the group indicator and the outcome
    bvarb : string
        name of the indicator column (compared against True / False)
    outcome : string
        name of the outcome column

    Returns
    -------
    Mean of `outcome` over rows where `bvarb` equals True, minus the mean of
    `outcome` over rows where `bvarb` equals False.
    """
    indicator = dataset[bvarb]
    outcomes = dataset[outcome]
    # Explicit equality (rather than truthiness / negation) so that rows whose
    # indicator is neither True nor False (e.g. NaN) land in neither group.
    mean_in_group = outcomes[indicator == True].mean()
    mean_out_group = outcomes[indicator == False].mean()
    return mean_in_group - mean_out_group

def chenEstimate(dataset, pbvarb='pBlack', outcome='audited', wvarb=None):
    """Compute the probability-weighted disparity estimator (binary class).

    Each row contributes to the "Black" mean with weight `pbvarb` and to the
    "non-Black" mean with weight `1 - pbvarb`; the estimate is the difference
    between those two weighted means of the outcome.

    Parameters
    ----------
    dataset : Pandas Dataframe
        dataset to use
    pbvarb : string
        name of variable for probability Black
    outcome : string
        name of variable for outcome
    wvarb : string, optional
        name of a column of additional row weights; when given, every term in
        the numerators and denominators is also multiplied by it

    Returns
    -------
    Weighted mean outcome for the Black group minus that of the non-Black group.
    """
    p_black = dataset[pbvarb]
    p_other = 1 - p_black
    y = dataset[outcome]

    if wvarb is None:
        black_rate = (p_black * y).sum() / p_black.sum()
        other_rate = (p_other * y).sum() / p_other.sum()
    else:
        w = dataset[wvarb]
        black_rate = (p_black * y * w).sum() / (p_black * w).sum()
        other_rate = (p_other * y * w).sum() / (p_other * w).sum()

    return black_rate - other_rate

def regEstimate(dataset,pbvarb='pBlack',outcome='audited', wvarb=None):
    """Fit the linear estimator and return its slope and standard error.

    Regresses `outcome` on `pbvarb` (with intercept, via the formula API) and
    fits with the 'HC1' covariance type.

    Parameters
    ----------
    dataset : Pandas Dataframe
        which dataset to use
    pbvarb : string
        variable name for probability Black
    outcome : string
        variable name for outcome
    wvarb : string, optional
        variable name for row weights; when given the model is fit by WLS with
        those weights, otherwise by OLS

    Returns
    -------
    (coef, se) : the fitted coefficient on `pbvarb` and its standard error.
    """
    formula = ' ~ '.join((outcome, pbvarb))

    if wvarb is None:
        spec = smf.ols(formula, dataset)
    else:
        spec = smf.wls(formula, dataset, weights=dataset[wvarb])

    # Both estimators share the same fitting options and extraction step.
    fitted = spec.fit(cov_type='HC1')
    return fitted.params[pbvarb], fitted.bse[pbvarb]

def getWVar(values, weights):
    '''
    Weighted variance of a variable: the weighted average of squared
    deviations from the weighted mean.

    Parameters
    ----------
    values: Numpy ndarray
          variable whose variance we want
    weights: Numpy ndarray (same dimensions as above)
          weights to use

    Returns
    -------
    The weighted variance (no degrees-of-freedom correction is applied).
    '''
    center = np.average(values, weights=weights)
    squared_dev = (values - center) ** 2
    return np.average(squared_dev, weights=weights)

def getSEMultiplier(dataset, pbvarb='pBlack', wvarb=None):
    """Get the factor that converts the linear estimator's standard error
    into the weighted estimator's standard error.

    The factor is sqrt(Var(p) / (mean(p) * mean(1 - p))), where p is the
    probability column.

    Parameters
    ----------
    dataset : Pandas Dataframe
        dataset which will be used
    pbvarb : string
        name of variable with probability Black
    wvarb : string, optional
        name of a weight column; when given, the variance and both means are
        weighted by it

    Returns
    -------
    The standard error multiplier.
    """
    p = dataset[pbvarb]

    if wvarb is None:
        # NOTE(review): Series.var() defaults to ddof=1, whereas the weighted
        # branch goes through getWVar - confirm the difference is intended.
        mean_p = p.mean()
        ratio = p.var() / (mean_p * (1 - mean_p))
    else:
        w = dataset[wvarb]
        weighted_mean_p = np.average(p, weights=w)
        weighted_mean_not_p = np.average(1 - p, weights=w)
        ratio = getWVar(p, w) / (weighted_mean_p * weighted_mean_not_p)

    return np.sqrt(ratio)

def getSEs(dataset, pbvarb='pBlack', outcome='audited', wvarb=None):
    """Compute standard errors for both the weighted and the linear estimator.

    The linear standard error comes from regEstimate; the weighted one is that
    value scaled by getSEMultiplier.

    Parameters
    ----------
    dataset : Pandas Dataframe
        dataset which will be used
    pbvarb : string
        variable name for probability Black
    outcome : string
        variable name for outcome
    wvarb : string, optional
        variable name for row weights, forwarded to both helpers

    Returns
    -------
    (seChen, seReg) : weighted-estimator SE first, linear-estimator SE second.
    """
    multiplier = getSEMultiplier(dataset, pbvarb, wvarb=wvarb)
    _, se_linear = regEstimate(dataset, pbvarb, outcome, wvarb=wvarb)
    se_weighted = se_linear * multiplier
    return se_weighted, se_linear

def generateBaseDataset(n,prior_likely_black = 0.2,prior_likely_nonblack=0.7,mean_black_prob_likely=0.9,mean_nonblack_prob_likely=0.1):
    """Create the baseline dataset to work with, based on the specified DGP.

    Parameters
    ----------
    n : int
        number of individuals in dataset
    prior_likely_black : float
        probability that an individual's *probability* of being Black is drawn
        from the high-Black-likelihood distribution.
    prior_likely_nonblack : float
        probability that an individual's *probability* of being Black is drawn
        from the low-Black-likelihood distribution.
        N.B. need not add with prior_likely_black to 1 - remainder is coin flip
        (mean probability 0.5).
    mean_black_prob_likely : float
        avg probability Black for taxpayer from high-Black-likelihood distribution
    mean_nonblack_prob_likely : float
        avg probability Black for taxpayer from low-Black-likelihood distribution

    Returns
    -------
    Pandas Dataframe with columns 'isBlack' (realized 0/1 indicator) and
    'pBlack' (probability, clamped to [0, 1]).
    """
    # Pass 1: pick the high or low mean for every individual.
    first_pass = [
        mean_black_prob_likely
        if np.random.uniform() < prior_likely_black
        else mean_nonblack_prob_likely
        for _ in range(n)
    ]

    # Pass 2: keep that mean, or fall back to the 0.5 "coin flip" mean.
    keep_threshold = prior_likely_black + prior_likely_nonblack
    centers = [
        center if np.random.uniform() < keep_threshold else 0.5
        for center in first_pass
    ]

    # Pass 3: add Gaussian noise (sd 0.03) around the mean and clamp to [0, 1].
    noisy = (np.random.normal() * 0.03 + center for center in centers)
    probs_black = [min(max(0, value), 1) for value in noisy]

    # Pass 4: realize race as a Bernoulli draw from each probability.
    realized = [int(np.random.uniform() < prob) for prob in probs_black]

    return pd.DataFrame({'isBlack': realized, 'pBlack': probs_black})

def getAudits(dataset,coefBlack=0.01,coefProb=0.01,noiseFactor=0.005,baseRate=0.005):
    """Generate audit probability and audit indicator based on the DGP.

    Adds (or overwrites) the columns 'auditProb' and 'audited' on `dataset`
    in place; nothing is returned.

    Parameters
    ----------
    dataset : Pandas Dataframe
        dataset with 'isBlack' and 'pBlack' columns; modified in place
    coefBlack : float
        audit probability penalty for Black individual
    coefProb : float
        audit probability penalty scaling the probability Black (independent
        of realization)
    noiseFactor : float
        scaling coefficient for random (standard normal) noise added to probability
    baseRate : float
        underlying probability of audit (i.e. non-Black 0 pct prob Black mean audit rate)
    """
    n_rows = len(dataset)

    noise = np.random.normal(size=n_rows) * noiseFactor
    raw_prob = dataset['isBlack'] * coefBlack + dataset['pBlack'] * coefProb + noise + baseRate
    # Noise and coefficients can push the value outside [0, 1]; clip it back.
    dataset['auditProb'] = raw_prob.clip(lower=0, upper=1)

    # Realize the audit as a Bernoulli draw from the clipped probability.
    draws = np.random.uniform(size=n_rows)
    dataset['audited'] = (draws < dataset['auditProb']).astype(int)

def generateAdditionalAudits(dataset,n_audit_realizations):
    """Generate fresh realizations of audits from the audit probability column.

    For now, dataset must have a variable named "auditProb". The input is not
    modified; a new dataframe with the extra columns is returned.

    Parameters
    ----------
    dataset: Pandas Dataframe
        dataset which you want the new audits to appear on
    n_audit_realizations:
        number of new audit realizations to generate

    Returns
    -------
    Pandas Dataframe with the original columns followed by
    'audited_0' ... 'audited_<n_audit_realizations - 1>' (0/1 indicators).
    """
    n_rows = len(dataset)
    audit_prob = dataset['auditProb']

    # One independent Bernoulli draw per row for every requested realization.
    realizations = [
        (np.random.uniform(size=n_rows) < audit_prob).astype(int)
        for _ in range(n_audit_realizations)
    ]

    combined = pd.concat([dataset, *realizations], axis=1)
    # The drawn series all inherit the same name, so relabel every column.
    combined.columns = [
        *dataset.columns,
        *(f'audited_{k}' for k in range(n_audit_realizations)),
    ]
    return combined


def drawImputedSubsample(dataset,ss,pbvarb='pBlack',newVarbName='isBlack'):
    """Draw a with-replacement sample of ss rows and realize each individual's
    race as a Bernoulli draw from the estimated probability.

    Parameters
    ----------
    dataset :  Pandas Dataframe
        dataset which you want to do the bootstrapping/realization from
    ss : int
        subsample size
    pbvarb : string
        name of the (BISG/etc-generated) probability an individual is Black
    newVarbName : string
        desired name for the new variable storing the realized race

    Returns
    -------
    Pandas Dataframe holding the resampled rows plus the 0/1 column
    `newVarbName`.
    """
    resampled = dataset.sample(n=ss, replace=True)
    draws = np.random.uniform(size=len(resampled))
    resampled[newVarbName] = (draws < resampled[pbvarb]).astype(int)
    return resampled

def drawImputedFullSample(dataset,pbvarb='pBlack',newVarbName='isBlack'):
    """Create a copy of the dataset with a new realization of the race dummy.

    Every row is kept (no resampling); the dummy is a Bernoulli draw from the
    row's probability.

    Parameters
    ----------
    dataset : Pandas Dataframe
        dataset you want to use (will get copy back, not original)
    pbvarb : string
        name of variable capturing the probability taxpayer is Black
    newVarbName : string
        name of the column that receives the realized race; defaults to
        'isBlack', mirroring drawImputedSubsample

    Returns
    -------
    Pandas Dataframe copy of `dataset` with the 0/1 column `newVarbName`
    added or overwritten.
    """
    newdata = dataset.copy()
    draws = np.random.uniform(size=len(newdata))
    newdata[newVarbName] = (draws < newdata[pbvarb]).astype(int)
    return newdata

def getObservationFromImputed(dataset,outcome='audited',bvarb='isBlack'):
    """Do the method-of-composition step.

    Fits an OLS regression of the outcome on the dummy variable, then returns
    a random draw from a normal distribution centred on the dummy's estimated
    coefficient with its standard error as the scale.

    Parameters
    ----------
    dataset :  Pandas Dataframe
        dataset with outcome and dummy variable
    outcome : string
        outcome variable name (e.g. audited but could be other)
    bvarb : string
        dummy variable name (e.g. isBlack but could be other)

    Returns
    -------
    Result of norm.rvs with size=1 (a single draw).
    """
    formula = ' ~ '.join((outcome, bvarb))
    result = smf.ols(formula, dataset).fit()
    return norm.rvs(loc=result.params[bvarb], scale=result.bse[bvarb], size=1)
